import pandas as pd
import numpy as np
import json
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
from fa2 import ForceAtlas2
import community
import os
import sys
import copy
import re
import nltk
from nltk.corpus import stopwords
import progressbar
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import community
from PIL import Image
import random
The dataset used in the following analysis is a collection of Wookieepedia[6] articles and movie/TV scripts from the Star Wars universe. Wookieepedia is the unofficial Star Wars wiki and contains articles about everything related to Star Wars. It was the obvious choice to get information about both the movies and the characters that appear in them. From the articles for every movie and every TV-episode the characters that appeared were extracted to build a list of every character that has appeared in the Star Wars universe. Using that character list the Wookieepedia article for each character was downloaded.
For the scripts the webpage Springfield! Springfield![7] was used. It contains a huge collection of movie and TV scripts/transcripts that are available to download for everyone.
I chose Wookieepedia to get the majority of my data because it is the un-official wiki for everything Star Wars. It has articles about all the movies with a list of characters that appeared in them, and it has articles for each of those characters. It is even so simple that the list of characters in the movies actually links directly to the pages of those characters.
I would have liked to use Wookieepedia for the scripts just like I used it for the rest of the data. The problem was that the scripts on Wookieepedia were not all on the same format and some of them were incomplete. When I figured that out I found the Springfield! Springfield! webpage. It seems to have all the complete scripts for both the movies and each TV-episode. More importantly all of it is on the same format, which makes it easier to use for text analysis.
I wanted to see what the network built using the Wookieepedia data looked like and if there were any surprises to be revealed — for example, are the best connected characters those that we expect?
I also wanted to use text analysis to find out what words characterize the scripts of the movies and see if they can describe the story the movie tells in any way.
With sentiment analysis I wanted to do two things. Firstly, I wanted to make a timeline of the sentiment in the Star Wars galaxy by using each movie and tv episode as a data point. The sentiment from the movies can be calculated from their scripts. Secondly, I wanted to find out who the most happy/unhappy characters are by analysing the sentiment in their Wookieepedia articles and see if the result is something that can be expected.
When downloaded from their respective pages all the scripts are just plain text, so no cleaning or preprocessing was applied there.
To be able to download all the character pages the names of all the characters that have appeared in each of the movies and TV-series had to be extracted. This is done by downloading the Wookieepedia pages of each movie and each TV-episode. On these pages the characters are all listed in their respective sections that start with either c-characters= or characters= and end with \n\n| or l-characters=. A regular expression is used to extract the links to each character page. The links are on the form [[link|optional title]]. The optional title for the links is only regarded in the case the link is Anakin Skywalker. This is done since the Anakin Skywalker page belongs to both Anakin Skywalker and Darth Vader. In this case the optional title is processed to check if the link belongs to Skywalker or Vader.
When all the data has been downloaded it is stored in a .json file where the pages for each movie are keyed by the name of the movie, the pages for the tv-episodes are keyed by the series name and the episode name, and the pages for the characters are keyed by the name of the characters. This .json file contains the pages of 11 movies, 2 animated TV-series with a total of 195 episodes between them, and the pages for 1945 characters. The file is almost 23 MB.
The scripts are stored in their own .json that is keyed in a similar way as the other .json file. The 11 movies are keyed by their name and the 195 TV-episodes are keyed by the name of the series they belong to and the name of the episode. The .json file for the scripts is almost 2.5 MB.
For each of the 11 movies and 2 animated TV-series a network[1] will be built that describes how characters are linked together. The nodes in the networks are the characters and a link from one node A to node B means that there exists a link in the Wookieepedia page of character A that point to the Wookieepedia page of character B. What can these networks tell us about the films and TV-series? We can find out what characters have the most connections and take a look at if these are the characters that we expect to see, i.e. the characters that play a central role in the storyline of the movies or TV-series.
For text analysis TF-IDF will be used to generate a measure of how often a word is used across all the scripts. That measure along with all the words will be used to make a Word Cloud for each film and TV-series. Can we in some way relate the words in the Word Clouds to the storyline of the films and TV-series? The sentiment of the scripts is calculated and used to create a timeline of how the sentiment evolves between movies. Can this timeline be explained in any way with the storyline of each individual movie in mind? For further sentiment analysis the Wookieepedia pages of each character are analyzed to find out who the most positive/negative characters are. Can we explain the results, or is Wookieepedia not a good source to analyze sentiment in the characters?
# Function to calculate TF-IDF
def TFIDF(dataframes):
    """Compute TF-IDF scores for a collection of documents.

    Parameters
    ----------
    dataframes : list of pd.DataFrame
        One DataFrame per document, each with a "word" column holding one
        token per row.

    Returns
    -------
    list of pd.DataFrame
        One DataFrame per input document with columns "word", "TF",
        "count" (document frequency), "IDF" and "TFIDF", sorted by
        "TFIDF" in descending order.
    """
    dataframes_tfidf = []
    all_words = []
    # Calculate TF (term frequency = raw count of the word in the document)
    for df in dataframes:
        counts = df.groupby("word").size()
        df_tfidf = pd.DataFrame(data=dict(word=counts.index, TF=counts.values))
        # BUG FIX: sort_values returns a new frame; the original discarded
        # the result, so the frames were never actually sorted.
        df_tfidf = df_tfidf.sort_values(by="TF", ascending=False).reset_index(drop=True)
        dataframes_tfidf.append(df_tfidf)
        # Collect each document's unique words to compute document frequency
        all_words.extend(counts.index)
    # Document frequency: in how many documents each word appears
    df_all = pd.DataFrame(data=dict(word=all_words)).groupby("word").size()
    df_all = pd.DataFrame(data=dict(word=df_all.index, count=df_all.values))
    # Calculate IDF
    N = len(dataframes)
    for i in range(len(dataframes_tfidf)):
        df_merged = pd.merge(dataframes_tfidf[i], df_all, on="word")
        # NOTE: non-standard IDF variant (N / (1 + log(df))) kept as-is to
        # preserve the original scoring behaviour.
        df_merged["IDF"] = N/(1+np.log(df_merged["count"]))
        df_merged["TFIDF"] = df_merged["TF"] * df_merged["IDF"]
        # BUG FIX: assign the sorted frame (sort_values is not in-place)
        df_merged = df_merged.sort_values(by="TFIDF", ascending=False)
        dataframes_tfidf[i] = df_merged
    return dataframes_tfidf
# Download the list of word sentiments
# Hedonometer word-happiness lexicon; only the columns used by the
# sentiment calculation below are kept.
word_sentiments = pd.read_csv("sentiment.csv")[["word", "happiness_rank", "happiness_average", "happiness_standard_deviation"]]
# A function that calculates the sentiment of a list of tokens
def evaluate_sentiment(tokens, sentiments=None):
    """Calculate the average happiness (sentiment) of a list of tokens.

    Implements the Hedonometer score: sum over words of
    h_avg(word) * normalized frequency of the word. Words missing from the
    lexicon are ignored (dropped by the inner merge).

    Parameters
    ----------
    tokens : list of str
        Tokenized text.
    sentiments : pd.DataFrame, optional
        Lexicon with at least "word" and "happiness_average" columns.
        Defaults to the module-level ``word_sentiments`` table.

    Returns
    -------
    float
        The frequency-weighted average happiness; 0.0 for empty input.
    """
    if sentiments is None:
        sentiments = word_sentiments
    # Robustness: avoid division by zero on an empty token list
    if len(tokens) == 0:
        return 0.0
    # Count how many times each word is mentioned
    counts = pd.DataFrame(data=dict(word=tokens)).groupby("word").size()
    # Convert counts to normalized frequency
    freq = counts.values / float(len(tokens))
    df_final = pd.DataFrame(data=dict(word=counts.index.values, freqs=freq))
    # Calculate the sentiment using word frequency and word happiness
    df_final = df_final.merge(sentiments, on="word")
    df_final["havg"] = df_final["happiness_average"]*df_final["freqs"]
    return sum(df_final["havg"])
Here the networks are created with the NetworkX library[10] in python. The nodes in the networks are the characters that appear in the movies, and the edges are the links that exist between characters according to the Wookieepedia analysis. Near end of this section there is an output field showing the size and average node degree of each of the networks that is created. The Force Atlas algorithm[9] is used to calculate the positions of each node in the network. These positions are then used when the networks are visualized.
#Load the data for the network
with open("Data/starwars_vader_split.json") as f:
    starwars_json = json.load(f)
# Define a list of movies and tv-series.
# NOTE: "Menice" is a typo, but these strings are keys into the json data
# and must match it exactly, so they are left unchanged.
movies = ["Star Wars: Episode I The Phantom Menice", "Star Wars: Episode II Attack of the Clones",
          "Star Wars: The Clone Wars (film)", "Star Wars: Episode III Revenge of the Sith",
          "Solo: A Star Wars Story", "Rogue One: A Star Wars Story", "Star Wars: Episode IV A New Hope",
          "Star Wars: Episode V The Empire Strikes Back", "Star Wars: Episode VI Return of the Jedi",
          "Star Wars: Episode VII The Force Awakens", "Star Wars: Episode VIII The Last Jedi"]
animated_series_names = ["Star Wars: The Clone Wars (series)", "Star Wars Rebels"]
# graph-per-title container, filled in below
digraphs = defaultdict(dict)
# BUG FIX: the original did `all_media = movies; all_media.extend(...)`,
# which aliased the two names and appended the series into `movies` itself.
# Concatenation keeps `movies` intact. (The duplicated list definitions
# that followed have been removed.)
all_media = movies + animated_series_names
# Create graphs for each film
# Meta-pages that were crawled but are not characters; never become nodes.
exclude_from_nodes = set(["canon", "hologram", "ABC Television Network"])
nodes = []
# Loop over all the movies and series
for m in all_media:
    # Instantiate an empty Directed Graph for this movie/series
    DG = nx.DiGraph()
    # All successfully downloaded characters, minus failures and excluded pages
    nodes = list(set(starwars_json["data"]["characters"].keys()).difference(set(starwars_json["failed"]["characters"].keys()).union(exclude_from_nodes)))
    for n in nodes:
        # Find what movies and TV-series that character has appeared in
        movie = [x for x in starwars_json["data"]["movies"].keys() if n in starwars_json["data"]["movies"][x]["characters"]]
        series = defaultdict(list)
        for s in starwars_json["data"]["series"].keys():
            for e in starwars_json["data"]["series"][s]:
                if n in starwars_json["data"]["series"][s][e]["characters"]:
                    series[s].append(e)
        # Find the affiliations of the character to store as node attribute
        affiliations = starwars_json["data"]["characters"][n]["affiliations"]
        if len(affiliations) < 1:
            affiliations = ["Unknown"]
        attribute = {"affiliations": affiliations}
        # Add character as node if it appears in the current movie/tv-serie
        if (m in movie) or (m in series):
            DG.add_node(n, data=attribute)
    # Create links between nodes: an edge A -> B means A's Wookieepedia page
    # links to B's page. Only links to nodes already in this graph count.
    for n in DG.nodes():
        links = list(set(starwars_json["data"]["characters"][n]["connections"]).difference(
            set(starwars_json["failed"]["characters"].keys()).union(exclude_from_nodes)).intersection(DG.nodes()))
        # Special case: Darth Vader and Anakin Skywalker share one
        # Wookieepedia page, so incoming links must be disambiguated.
        if n == "Darth Vader":
            new_links = []
            for l in links:
                # Find what links point to Darth Vader and not Anakin Skywalker
                if "Darth Vader" in starwars_json["data"]["characters"][l]["connections"] or "Anakin Skywalker" not in starwars_json["data"]["characters"][l]["connections"]:
                    new_links.append(l)
            links = new_links
        if n == "Anakin Skywalker":
            new_links = []
            for l in links:
                # Find what links point to Anakin Skywalker and not Darth Vader
                if "Darth Vader" not in starwars_json["data"]["characters"][l]["connections"] or "Anakin Skywalker" in starwars_json["data"]["characters"][l]["connections"]:
                    new_links.append(l)
            links = new_links
        if len(links) > 0:
            for l in links:
                DG.add_edge(n, l)
    # Store the finished graph and print summary statistics (Python 2 prints)
    digraphs[m]["graph"] = DG
    print m
    print "Nodes:", str(len(DG.nodes()))
    print "Edges:", str(len(DG.edges()))
    print "AVG degree:", np.mean(zip(*DG.degree())[1])
    print
    print
#Create graph of the entire joint universe using the same procedure as above,
# except that every successfully downloaded character becomes a node and the
# links are not restricted to the cast of a single movie/series.
DG = nx.DiGraph()
nodes = list(set(starwars_json["data"]["characters"].keys()).difference(set(starwars_json["failed"]["characters"].keys()).union(exclude_from_nodes)))
for n in nodes:
    # NOTE: `movie` and `series` are computed but unused here (leftover from
    # the per-movie loop above, where they gated node membership).
    movie = [x for x in starwars_json["data"]["movies"].keys() if n in starwars_json["data"]["movies"][x]["characters"]]
    series = defaultdict(list)
    for s in starwars_json["data"]["series"].keys():
        for e in starwars_json["data"]["series"][s]:
            if n in starwars_json["data"]["series"][s][e]["characters"]:
                series[s].append(e)
    # Store each character's affiliations as a node attribute
    affiliations = starwars_json["data"]["characters"][n]["affiliations"]
    if len(affiliations) < 1:
        affiliations = ["Unknown"]
    attribute = {"affiliations": affiliations}
    DG.add_node(n, data=attribute)
# Create links between nodes
for n in DG.nodes():
    links = list(set(starwars_json["data"]["characters"][n]["connections"]).difference(
        set(starwars_json["failed"]["characters"].keys()).union(exclude_from_nodes)))
    # Darth Vader and Anakin Skywalker share one page; disambiguate links.
    if n == "Darth Vader":
        new_links = []
        for l in links:
            if "Darth Vader" in starwars_json["data"]["characters"][l]["connections"] or "Anakin Skywalker" not in starwars_json["data"]["characters"][l]["connections"]:
                new_links.append(l)
        links = new_links
    if n == "Anakin Skywalker":
        new_links = []
        for l in links:
            if "Darth Vader" not in starwars_json["data"]["characters"][l]["connections"] or "Anakin Skywalker" in starwars_json["data"]["characters"][l]["connections"]:
                new_links.append(l)
        links = new_links
    if len(links) > 0:
        for l in links:
            DG.add_edge(n, l)
# The combined graph is stored under the key "all" (Python 2 prints below)
digraphs["all"]["graph"] = DG
print "Everything"
print "Nodes:", str(len(DG.nodes()))
print "Edges:", str(len(DG.edges()))
print "AVG degree:", np.mean(zip(*DG.degree())[1])
# If statement to disable the Force Atlas algorithm to save time. The generated .json file from previous runs will be used.
if 0:
# Use the Force Atlas algorithm to calculate node positions for each of the networks
for dig in digraphs.keys():
print
print dig
DG = digraphs[dig]["graph"]
# Setup for the force atlas algorithm.
forceatlas2 = ForceAtlas2(
# Behavior alternatives
outboundAttractionDistribution=False, # Dissuade hubs
linLogMode=False, # NOT IMPLEMENTED
adjustSizes=False, # Prevent overlap (NOT IMPLEMENTED)
edgeWeightInfluence=1.0,
# Performance
jitterTolerance=0.01, # Tolerance
barnesHutOptimize=True,
barnesHutTheta=0.12,
multiThreaded=False, # NOT IMPLEMENTED
# Tuning
scalingRatio=0.02,
strongGravityMode=False,
gravity=1.0,
# Log
verbose=True)
# Create an undirected version of the graph
G = DG.to_undirected()
# Calculate node positions with the force atlas algorithm using the undirected graph
positions = forceatlas2.forceatlas2_networkx_layout(G, pos=None, iterations=20000)
pos_for_json = {}
for x in positions:
pos_for_json[x] = list(positions[x])
digraphs[dig]["node_pos"] = pos_for_json
# Convert network data to json serializeable data and save to a file
dgraphs = copy.deepcopy(digraphs)
for dig in dgraphs.keys():
dgraphs[dig]["graph"] = nx.readwrite.json_graph.node_link_data(DG)
with open(os.path.join("Data", "movie_networks.json"), 'w') as f:
json.dump(dgraphs, f)
# Read the json file that contains data for the node positions
with open(os.path.join("Data", "movie_networks.json"), 'r') as f:
    starwars_networks = json.load(f)
# Plot all the networks
for dig in digraphs.keys():
    DG = digraphs[dig]["graph"]
    nodes = DG.nodes(data=True)
    nodes_2, degrees = zip(*DG.degree())
    # node size proportional to the degree
    node_sizes_deg = [7*x for x in degrees]
    # Larger canvases for the biggest networks
    if dig in ("all","Star Wars: The Clone Wars (series)"):
        plt.figure(figsize=(50,30))
    elif dig == "Star Wars Rebels":
        plt.figure(figsize=(30,30))
    else:
        plt.figure(figsize=(20,20))
    # Draw with the precomputed Force Atlas positions loaded from disk
    nx.draw(DG, starwars_networks[dig]["node_pos"], node_color='red', node_size=node_sizes_deg, with_labels=False, edgecolors="white", edge_color='k', width=0.1)
    print dig
    # Save under Data/<title with spaces replaced and colons stripped>/
    directory = os.path.join("Data", dig.replace(" ", "_").replace(":",""))
    if not os.path.exists(directory):
        os.makedirs(directory)
    plt.savefig(os.path.join(directory, "network.png"))
    plt.show()
The networks are used to find out what characters are the best connected ones in each of the Star Wars films and TV-series. We calculate who is best connected with a few different metrics. We use:
Note that the metrics based on degree only consider the neighbors of a node, but the eigenvector and betweenness centrality consider the nodes in a bigger context. They consider the influence of the node based on all the other nodes in the network. At the end of the next cell tables with the top 10 characters for each network are shown, one for each of these metrics. The network node degree distributions are also shown.
# Find out who the most connected characters are for each graph
for d in digraphs.keys():
print "\n\n\n"
print d
DG = digraphs[d]["graph"]
nodes = DG.nodes(data=True)
nodes_2, degrees = zip(*DG.degree())
# Calculate degree centrality
deg_cent = nx.degree_centrality(DG)
degree_centrality = [deg_cent[n] for n in nodes_2]
# Calculate eigenvector centrality
eig_cent = nx.eigenvector_centrality(DG)
eigenvector_centrality = [eig_cent[n] for n in nodes_2]
# Calculate betweenness centrality
bet_cent = nx.betweenness_centrality(DG)
betweenness_centrality = [bet_cent[n] for n in nodes_2]
# Get In and Out node degrees
in_deg = DG.in_degree()
out_deg = DG.out_degree()
in_degree = [in_deg[n] for n in nodes_2]
out_degree = [out_deg[n] for n in nodes_2]
df = pd.DataFrame(data=dict(name=nodes_2, degree=degrees,
degree_centrality=degree_centrality,
eigenvector_centrality=eigenvector_centrality,
betweenness_centrality=betweenness_centrality,
in_degree=in_degree,
out_degree=out_degree))
directory = os.path.join("Data", d.replace(" ", "_").replace(":",""))
if not os.path.exists(directory):
os.makedirs(directory)
writer = pd.ExcelWriter(os.path.join(directory, "centrality.xlsx"), engine='xlsxwriter')
df.to_excel(writer, sheet_name='Sheet1')
# Display the top 10 connected characters based on the different metrics calculated above
from IPython.display import display, HTML
df = df.sort_values(by="degree", ascending=False)
print "10 highest degree"
display(HTML(df[["name", "degree"]][:10].to_html(index=False)))
with open(os.path.join(directory, "top10_degree.txt"), 'w') as f:
f.write(df[["name", "degree"]][:10].to_html(index=False).encode('utf-8'))
df = df.sort_values(by="in_degree", ascending=False)
print "10 highest in-degree"
display(HTML(df[["name", "in_degree"]][:10].to_html(index=False)))
with open(os.path.join(directory, "top10_in_degree.txt"), 'w') as f:
f.write(df[["name", "in_degree"]][:10].to_html(index=False).encode('utf-8'))
df = df.sort_values(by="out_degree", ascending=False)
print "10 highest out-degree"
display(HTML(df[["name", "out_degree"]][:10].to_html(index=False)))
with open(os.path.join(directory, "top10_out_degree.txt"), 'w') as f:
f.write(df[["name", "out_degree"]][:10].to_html(index=False).encode('utf-8'))
df = df.sort_values(by="degree_centrality", ascending=False)
print "10 highest degree centrality"
display(HTML(df[["name", "degree_centrality"]][:10].to_html(index=False)))
with open(os.path.join(directory, "top10_deg_cent.txt"), 'w') as f:
f.write(df[["name", "degree_centrality"]][:10].to_html(index=False).encode('utf-8'))
df = df.sort_values(by="eigenvector_centrality", ascending=False)
print "10 highest eigenvector centrality"
display(HTML(df[["name", "eigenvector_centrality"]][:10].to_html(index=False)))
with open(os.path.join(directory, "top10_eig_cent.txt"), 'w') as f:
f.write(df[["name", "eigenvector_centrality"]][:10].to_html(index=False).encode('utf-8'))
df = df.sort_values(by="betweenness_centrality", ascending=False)
print "10 highest betweenness_centrality"
display(HTML(df[["name", "betweenness_centrality"]][:10].to_html(index=False)))
with open(os.path.join(directory, "top10_betw_cent.txt"), 'w') as f:
f.write(df[["name", "betweenness_centrality"]][:10].to_html(index=False).encode('utf-8'))
# Calculate degree distributions
degree = zip(*DG.degree())[1]
in_degree = zip(*DG.in_degree())[1]
out_degree = zip(*DG.out_degree())[1]
bins = np.arange(min(degree), max(degree)+1)
hist, bins = np.histogram(degree, bins=bins)
plt.figure(figsize=(16,9))
plt.loglog(bins[:-1], hist, 'o')
plt.xlabel("Degree", fontsize=24)
plt.ylabel("Frequency (Count)", fontsize=24)
plt.title("Degree Distribution", fontsize=26)
plt.tick_params(labelsize=20, length=12, width=3, which="major")
plt.tick_params(length=7, width=3, which="minor")
plt.savefig(os.path.join(directory, "degree_distribution.png"))
in_bins = np.arange(min(in_degree), max(in_degree)+1)
in_hist, in_bins = np.histogram(in_degree, bins=in_bins)
plt.figure(figsize=(16,9))
plt.loglog(in_bins[:-1], in_hist, 'o')
plt.xlabel("In-Degree", fontsize=24)
plt.ylabel("Frequency (Count)", fontsize=24)
plt.title("In-Degree Distribution", fontsize=26)
plt.tick_params(labelsize=20, length=12, width=3, which="major")
plt.tick_params(length=7, width=3, which="minor")
plt.savefig(os.path.join(directory, "in_degree_distribution.png"))
out_bins = np.arange(min(out_degree), max(out_degree)+1)
out_hist, out_bins = np.histogram(out_degree, bins=out_bins)
plt.figure(figsize=(16,9))
plt.loglog(out_bins[:-1], out_hist, 'o')
plt.xlabel("Out-Degree", fontsize=24)
plt.ylabel("Frequency (Count)", fontsize=24)
plt.title("Out-Degree Distribution", fontsize=26)
plt.tick_params(labelsize=20, length=12, width=3, which="major")
plt.tick_params(length=7, width=3, which="minor")
plt.savefig(os.path.join(directory, "out_degree_distribution.png"))
# Save degree distributions to json
degree_dist_json = {"data": {"In_Degree": {"bins": [float(x) for x in bins[:-1]], "hist": [float(x) for x in hist]},
"Out_Degree": {"bins": [float(x) for x in out_bins[:-1]], "hist": [float(x) for x in out_hist]},
"Degree": {"bins": [float(x) for x in in_bins[:-1]], "hist": [float(x) for x in in_hist]}}}
with open(os.path.join(directory, "degree_distribution.json"), 'w') as f:
json.dump(degree_dist_json, f)
Looking at all the tables that show the top 10 characters based on the different metrics that are described above, we can see that in most cases the top connected characters are those that play a central role in the storyline of the films. These are both the characters that have a lot of screen time, but also characters that see lower screen time but are nevertheless very important for the story as a whole. There are some exceptions to this. In a few cases we see other characters that have become somewhat of fan favorites. These characters do not necessarily play a big role in anything. An example of that is Sy Snootles who appears in the top 10 characters in Episode VI based on betweenness centrality. She is the singer for The Max Rebo Band that played during the scene within Jabba's Palace.
When the Wookieepedia pages were processed for links between characters the affiliations of each character were also collected. These affiliations were stored as node attributes when creating the networks. Lets take a look at the network for the entire universe, i.e. the single network that combines all the films and TV-series. Since each character can have more than one affiliation it can make the creation of communities from them harder. Therefore only the first affiliation mentioned in each article is considered. We will visualize the networks and assign colors to the nodes depending on what affiliation they belong to. We will also calculate the optimal split into communities for the network and visualize it in the same way. For optimal split the Louvain algorithm[11] is used. For both community splits the modularity, $M$[1(section 9.4)], is calculated to identify which split has more pure communities. It would be optimal to make a confusion matrix to take a look at the communities that have formed, but since we have over 200 communities it would be impossible to visualize in a good way.
# Attach each character's first-mentioned affiliation as a node attribute,
# then tally how many characters belong to each affiliation.
DG = digraphs["all"]["graph"]
nodes = DG.nodes(data=True)
new_aff = defaultdict(dict)
for name, attrs in nodes:
    # Only the first affiliation listed in the article is kept
    new_aff[name]["affil"] = attrs["data"]["affiliations"][0]
nx.set_node_attributes(DG, new_aff)
# Count the number of members associated with each affiliation
affiliations_count = defaultdict(int)
all_affiliations = []
nodes = DG.nodes(data=True)
for name, attrs in nodes:
    all_affiliations.append(attrs["affil"])
    affiliations_count[attrs["affil"]] += 1
all_affiliations = list(set(all_affiliations))
# Summarize the tallies in a dataframe, largest affiliation first
affiliation_df = pd.DataFrame(data={"group": all_affiliations, "count": [affiliations_count[g] for g in all_affiliations]})
affiliation_df = affiliation_df[(affiliation_df["count"]>0)].sort_values(by="count", ascending=False)[["group", "count"]]
# Calculate the modularity coefficient, M = sum_c [Lc/L - (kc/2L)^2], of the
# partition defined by the characters' affiliations.
G = DG.to_undirected()
M = 0.0
L = float(len(G.edges()))
nodes = DG.nodes(data=True)
for com in affiliation_df["group"].values:
    # All characters belonging to this affiliation/community
    list_nodes = [n[0] for n in nodes
                  if n[1]["affil"] == com]
    Lc = 0.0  # number of edges internal to the community
    kc = 0.0  # total degree of the community's nodes
    # BUG FIX: the original iterated range(len(list_nodes)-1) and therefore
    # omitted the last node's degree from kc. Lc is unaffected because the
    # last node has no nodes "ahead" of it in list_nodes[i+1:].
    for i in range(len(list_nodes)):
        character = list_nodes[i]
        kc += G.degree(character)
        for neighbor in G.neighbors(character):
            # Count each internal edge once by only looking ahead in the list
            if neighbor in list_nodes[i+1:]:
                Lc += 1.0
    M += ((Lc/L) - np.power((kc/(2*L)), 2))
print("The modularity of the community partition based on affiliation is: M={0}".format(M))
# Find the best community partition with the Louvain algorithm
G = DG.to_undirected()
partition = community.best_partition(G)
# Calculate the modularity coefficient, M = sum_c [Lc/L - (kc/2L)^2],
# of this optimal partition.
M = 0.0
L = float(len(G.edges()))
com_size = defaultdict(int)
for com in set(partition.values()):
    # All characters assigned to this Louvain community
    list_nodes = [node for node in partition.keys()
                  if partition[node] == com]
    Lc = 0.0  # number of edges internal to the community
    kc = 0.0  # total degree of the community's nodes
    # BUG FIX: the original iterated range(len(list_nodes)-1) and therefore
    # omitted the last node's degree from kc. Lc is unaffected because the
    # last node has no nodes "ahead" of it in list_nodes[i+1:].
    for i in range(len(list_nodes)):
        character = list_nodes[i]
        kc += G.degree(character)
        for neighbor in G.neighbors(character):
            # Count each internal edge once by only looking ahead in the list
            if neighbor in list_nodes[i+1:]:
                Lc += 1.0
    M += ((Lc/L) - np.power((kc/(2*L)), 2))
print("The modularity of the optimal community partition is: M={0}".format(M))
# Calculate the sizes of the optimal communities and store results in a dataframe
for com in partition.values():
    com_size[com] += 1
df_community_size = pd.DataFrame(data={"com":com_size.keys(), "count":[com_size[x] for x in com_size.keys()]})
df_community_size = df_community_size[["com", "count"]].sort_values(by="count", ascending=False)
# Read the json file that contains data for graph regeneration
with open(os.path.join("Data", "movie_networks.json"), 'r') as f:
    starwars_networks = json.load(f)
nodes = DG.nodes(data=True)
nodes_2, degrees = zip(*DG.degree())
# node size proportional to the degree
node_sizes_deg = [10*x for x in degrees]
# Color each node by its Louvain community id
node_colors = [partition[n] for n in nodes_2]
cmap = plt.get_cmap("jet")
max_partition = float(max(node_colors))
# Only communities with at least 10 members get their own colour;
# every smaller community is drawn as a white node.
biggest_coms = list(df_community_size[df_community_size["count"]>9]["com"].values)
node_colors = [cmap(biggest_coms.index(col)/float(len(biggest_coms))) if col in biggest_coms else (1.0, 1.0, 1.0, 1.0) for col in node_colors]
plt.figure(figsize=(50,30))
print "Optimal communities"
# Draw with the precomputed Force Atlas positions loaded from disk
nx.draw(DG, starwars_networks["all"]["node_pos"], node_color=node_colors, node_size=node_sizes_deg, with_labels=False, edgecolors="k", edge_color='k', width=0.1)
directory = os.path.join("Data", "all")
if not os.path.exists(directory):
    os.makedirs(directory)
plt.savefig(os.path.join(directory, "network_optimal_partitions.png"))
# Visualize the same network coloured by first-mentioned affiliation instead
nodes = DG.nodes(data=True)
nodes_2, degrees = zip(*DG.degree())
# node size proportional to the degree
node_sizes_deg = [10*x for x in degrees]
# Only affiliations with at least 10 members get their own colour
possible_affiliations = affiliation_df[(affiliation_df["count"]>9)]
temp = dict(DG.nodes(data=True))
node_colors = [temp[n]["affil"] for n in nodes_2]
cmap = plt.get_cmap("jet")
biggest_coms = list(possible_affiliations["group"].values)
# Smaller affiliations are drawn as white nodes
node_colors = [cmap(biggest_coms.index(col)/float(len(biggest_coms))) if col in biggest_coms else (1.0, 1.0, 1.0, 1.0) for col in node_colors]
plt.figure(figsize=(50,30))
print "Communities based on affiliations"
# Draw with the precomputed Force Atlas positions loaded from disk
nx.draw(DG, starwars_networks["all"]["node_pos"], node_color=node_colors, node_size=node_sizes_deg, with_labels=False, edgecolors="k", edge_color='k', width=0.1)
directory = os.path.join("Data", "all")
if not os.path.exists(directory):
    os.makedirs(directory)
plt.savefig(os.path.join(directory, "network_affiliation_partitions.png"))
Lets start by looking at the modularity, $M$, of the two different community splits. The modularities are:
We can see that since M>0.0 for both splits either one of them could be used as communities. The communities that are based on affiliations are not optimal because the modularity is not high enough. The optimal split creates better communities because it has a considerably higher modularity coefficient.
The same conclusion can be drawn from looking at the visualization of the two different splits. The optimal community visualization shows that the communities are very well grouped together with only a small overlap between some of the communities. This is not the case for the affiliation visualization. There the communities are more erratic. They are not as well grouped together as the optimal communities are and the overlap between different communities is much larger.
Note that because of the high number of different communities it was not possible to assign a color to each one of them. Every community that has 10 or more members have their own colors, but the other communities that have 9 or less members are all displayed as white nodes.
The sentiment of a text tells us something about how happy/unhappy or positive/negative the contents of the text is. We calculate the sentiment in a text using the Hedonometer algorithm[12]. To get the happiness of words Data Set 1 from the same article is used. The happiness of a text, $T$, is calculated as: $h_{avg}(T)=\sum_{i=1}^{N} h_{avg}(w_i)p_i$ where $p_i$ is the normalized frequency of word $w_i$ and $h_{avg}(w_i)$ is the average happiness of word $w_i$.
Lets find the top 5 and bottom 5 characters based on the sentiment calculated using their Wookieepedia pages. Do the results make any sense based on my knowledge of the Star Wars universe?
Before the sentiment can be calculated the texts need to be tokenized. That is done with the use of regular expressions[13] and then filtering some specific words out afterwards. The regular expression used is:
r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
| [A-Za-z]\w*(?:-\w+)* # words with optional internal hyphens
'''
The regular expression matches abbreviations with all capital letters and more general words that start with a letter. After the initial letter there can be either more letters or alphanumeric characters. This filters out all numbers so we don't have to do that in a loop later on. After tokenization we filter out all of the stopwords in the text using the list of stopwords in the NLTK[14] python library. Using the list of tokens/words the sentiment can easily be calculated. Instead of considering all of the 1945 characters we only consider the top 50 important characters based on degree centrality. We do this since the most important characters tend to have more detailed Wookieepedia pages and are therefore a better source to analyze for sentiment.
# ADD SENTIMENT TO THE JSON
# Tokenize every character's Wookieepedia article and compute its Hedonometer
# sentiment score; results are keyed by character name in char_sentiment.
dataframes = []
char_sentiment = defaultdict(dict)
chars = list(set(starwars_json["data"]["characters"].keys()).difference(set(starwars_json["failed"]["characters"].keys()).union(exclude_from_nodes)))
for i in range(len(chars)):
    c = chars[i]
    text = starwars_json["data"]["characters"][c]["wikicontent"]
    tokens = []
    # Only detect words and abbreviations, not numbers or punctuations
    pattern = r'''(?x) # set flag to allow verbose regexps
    (?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
    | [A-Za-z]\w*(?:-\w+)* # words with optional internal hyphens
    '''
    stopw = stopwords.words('english')
    # extra words to exclude on top of the NLTK stopwords (currently none)
    other_words = []
    # Tokenize the text with regex
    tokens = nltk.regexp_tokenize(text, pattern)
    # Lowercase every token and drop stopwords / excluded words
    tokens = [t.lower() for t in tokens if ((t.lower() not in stopw) and (t.lower() not in other_words))]
    # Create a dataframe from the tokens and calculate sentiment
    df_tokens = pd.DataFrame(data=dict(word=tokens))
    dataframes.append(df_tokens)
    char_sentiment[c]["tokens"] = df_tokens
    char_sentiment[c]["sentiment"] = evaluate_sentiment(tokens)
# Rank characters by the sentiment of their Wookieepedia pages; only the 50
# most central characters (by degree centrality) are considered, since they
# tend to have the most detailed articles.
DG = digraphs["all"]["graph"]
nodes = DG.nodes(data=True)
nodes_2, degrees = zip(*DG.degree())
deg_cent = nx.degree_centrality(DG)
degree_centrality = [deg_cent[n] for n in nodes_2]
eig_cent = nx.eigenvector_centrality(DG)
eigenvector_centrality = [eig_cent[n] for n in nodes_2]
df = pd.DataFrame(data=dict(name=nodes_2, degree=degrees,
                            degree_centrality=degree_centrality,
                            eigenvector_centrality=eigenvector_centrality))
df = df.sort_values(by="degree_centrality", ascending=False)
# Attach the sentiment computed for each character above
sentiment_list = []
for n in df["name"].values:
    sentiment_list.append(char_sentiment[n]["sentiment"])
df["sentiment"] = sentiment_list
# [:50] limits to the 50 most central characters; the trailing slices pick
# the 5 highest and 5 lowest sentiment scores among them (Python 2 prints).
print "Top 5 characters based on sentiment"
print df[["name", "sentiment"]][:50].sort_values(by="sentiment", ascending=False)[:5]
print "Bottom 5 characters based on sentiment"
print df[["name", "sentiment"]][:50].sort_values(by="sentiment", ascending=False)[-5:]
Looking at the top 5 happiest/most positive characters we notice they all belong to factions that are aligned with the light side of the force, i.e. they are the good guys. The opposite is true for the 5 most unhappy/negative. They are characters that are considered as the bad guys, except for Rex. Rex was a clone captain in the 501st legion. Perhaps he appears as negative since all of his appearances have been related to the Clone Wars, a time of conflict that most likely affects the contents of his Wookieepedia page.
Can the most used words in the scripts of each film/TV-series tell us something? Do they in some way describe the plot of the film? To figure that out we can use Word Clouds[8] to display the most used words. To find the most used words we have to tokenize the scripts first. We use the same method as we used to tokenize the character pages described above. From the list of words for each film/TV-series we can use a method called TF-IDF[15] (Term Frequency - Inverse Document Frequency) where we consider each film/TV-series as a document. The TF-IDF score tells us something about how often a word is used. More than that it also takes into account how many of the documents it is used in and assigns a high score to words that are used often, but in few documents.
# Read the scripts JSON scraped from Springfield! Springfield!
# Layout used below: {"data": {"movies": {title: {"transcript": ...}},
#                              "series": {name: {ep: {"transcript": ...,
#                                                     "episode": ...}}}}}
with open("starwars_transcripts_springfield.json", 'r') as f:
    transcript_json = json.load(f)
# Define a list of movies and series and order them by time
# NOTE(review): "Phantom Menice" is misspelled, but these strings are the
# lookup keys into transcript_json -- presumably they match the scraped
# data exactly, so do not "fix" the spelling without changing the data too.
movies = ["Star Wars: Episode I The Phantom Menice", "Star Wars: Episode II Attack of the Clones",
"Star Wars: The Clone Wars (film)", "Star Wars: Episode III Revenge of the Sith",
"Solo: A Star Wars Story", "Rogue One: A Star Wars Story", "Star Wars: Episode IV A New Hope",
"Star Wars: Episode V The Empire Strikes Back", "Star Wars: Episode VI Return of the Jedi",
"Star Wars: Episode VII The Force Awakens", "Star Wars: Episode VIII The Last Jedi"]
animated_series_names = ["Star Wars: The Clone Wars (series)", "Star Wars Rebels"]
# (kind, title) pairs in in-universe chronological order, interleaving the
# animated series where they fall between the films.
media_in_time_order = [("movies", "Star Wars: Episode I The Phantom Menice"), ("movies", "Star Wars: Episode II Attack of the Clones"),
("movies", "Star Wars: The Clone Wars (film)"),
("series", "Star Wars: The Clone Wars (series)"),
("movies", "Star Wars: Episode III Revenge of the Sith"),
("movies", "Solo: A Star Wars Story"),
("series", "Star Wars Rebels"),
("movies", "Rogue One: A Star Wars Story"), ("movies", "Star Wars: Episode IV A New Hope"),
("movies", "Star Wars: Episode V The Empire Strikes Back"), ("movies", "Star Wars: Episode VI Return of the Jedi"),
("movies", "Star Wars: Episode VII The Force Awakens"), ("movies", "Star Wars: Episode VIII The Last Jedi")]
# Tokenize all the transcripts.
# The original code duplicated the tokenization logic in the movie loop and
# the series loop, rebuilt the English stopword *list* for every script, and
# membership-tested against that list (O(n) per token). Hoisting the pattern
# and a stopword set, and sharing one helper, fixes all three.

# Only detect words and abbreviations, not numbers or punctuation.
TOKEN_PATTERN = r'''(?x) # set flag to allow verbose regexps
(?:[A-Z]\.)+ # abbreviations, e.g. U.S.A.
| [A-Za-z]\w*(?:-\w+)* # words with optional internal hyphens
'''
# Build the stopword set once: set membership is O(1) vs O(n) on a list.
STOPWORD_SET = set(stopwords.words('english'))

def tokenize_script(script):
    """Return the lowercase word tokens of ``script`` with stopwords removed."""
    raw_tokens = nltk.regexp_tokenize(script, TOKEN_PATTERN)
    # Lowercase each token once, then filter.
    return [low for low in (t.lower() for t in raw_tokens)
            if low not in STOPWORD_SET]

transcript_tokens = defaultdict(dict)
dataframes = []  # one "document" per movie, plus one combined one per series
for m in movies:
    tokens = tokenize_script(transcript_json["data"]["movies"][m]["transcript"])
    df_tokens = pd.DataFrame(data=dict(word=tokens))
    dataframes.append(df_tokens)
    transcript_tokens[m]["tokens"] = df_tokens
    transcript_tokens[m]["sentiment"] = evaluate_sentiment(tokens)
for s in animated_series_names:
    all_tokens = []
    for e in transcript_json["data"]["series"][s]:
        episode = transcript_json["data"]["series"][s][e]
        tokens = tokenize_script(episode["transcript"])
        df_tokens = pd.DataFrame(data=dict(word=tokens))
        all_tokens.extend(tokens)
        transcript_tokens[s][e] = {"tokens": df_tokens,
                                   "episode": episode["episode"],
                                   "sentiment": evaluate_sentiment(tokens)}
    # Pool every episode's tokens so TF-IDF treats the whole series as a
    # single document, comparable to each movie.
    df_all_tokens = pd.DataFrame(data=dict(word=all_tokens))
    dataframes.append(df_all_tokens)
    transcript_tokens[s]["all_tokens"] = df_all_tokens
# Attach each document's TF-IDF table back onto its source. TFIDF() returns
# one dataframe per entry of `dataframes`: the first len(movies) entries are
# the movies in order, followed by one combined entry per animated series.
tfidf_dataframes = TFIDF(dataframes)
# Calculate TF-IDF for each movie
for i, m in enumerate(movies):
    transcript_tokens[m]["TFIDF"] = tfidf_dataframes[i]
# Calculate TF-IDF for each TV-series (their dataframes follow the movies')
for j, s in enumerate(animated_series_names, start=len(movies)):
    transcript_tokens[s]["TFIDF"] = tfidf_dataframes[j]
# Make Word Clouds with masks to make them look like Star Wars
def grey_color_func(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    """Return a random light-grey HSL color string (60-100% lightness).

    Signature matches what WordCloud passes to a color_func; every argument
    except the random lightness is ignored.
    """
    lightness = random.randint(60, 100)
    return "hsl(0, 0%, {}%)".format(lightness)
# Shape-mask images for the word clouds, index-aligned with `movies`
# (one mask per movie, in the same order as the `movies` list).
masks = ["Data/WordCloudMasks/maul_mask.jpg", "Data/WordCloudMasks/clone_trooper_mask.jpg", "Data/WordCloudMasks/ahsoka3.jpg",
"Data/WordCloudMasks/yoda_mask2.jpg", "Data/WordCloudMasks/chewy_mask.jpg", "Data/WordCloudMasks/darth_vader2.jpg",
"Data/WordCloudMasks/xwing.jpg", "Data/WordCloudMasks/stormtrooper_mask.png", "Data/WordCloudMasks/R2D2_mask.jpg",
"Data/WordCloudMasks/bb8_mask.jpg", "Data/WordCloudMasks/star_destroyer_mask.jpg"]
# Reference images used to recolor each cloud via ImageColorGenerator;
# a None entry falls back to the random-grey `grey_color_func` instead.
coloring = ["Data/WordCloudMasks/maul_colors.jpg", "Data/WordCloudMasks/clone_trooper.jpg", "Data/WordCloudMasks/ahsoka3.jpg",
"Data/WordCloudMasks/yoda_mask2.jpg", "Data/WordCloudMasks/chewy.jpg", "Data/WordCloudMasks/darth_vader2.jpg",
"Data/WordCloudMasks/xwing.jpg", None, "Data/WordCloudMasks/R2D2.png", "Data/WordCloudMasks/bb8.jpg",
"Data/WordCloudMasks/star_destroyer.jpg"]
# Cloud background color per movie, also index-aligned with `movies`.
backgrounds = ["black", "black", "black", "black", "black", "white", "white", "black", "black", "black", "white"]
i = 0
# Make Word Clouds based on TF-IDF for each movie
for m in movies:
print m
df_tfidf = transcript_tokens[m]["TFIDF"]
df_tfidf.index = df_tfidf.word
df_tfidf.TFIDF = np.ceil(df_tfidf.TFIDF.values)
mask = np.array(Image.open(masks[i]))
if coloring[i] is not None:
image_colors = ImageColorGenerator(np.array(Image.open(coloring[i])))
wc = WordCloud(background_color=backgrounds[i], max_words=2000, mask=mask, stopwords=STOPWORDS).generate_from_frequencies(df_tfidf["TFIDF"].to_dict())
# show
fig = plt.figure(figsize=(50,50), frameon=False)
ax = plt.Axes(fig, [0., 0., 1., 1.], )
ax.set_axis_off()
fig.add_axes(ax)
directory = os.path.join("Data", m.replace(" ", "_").replace(":",""))
if not os.path.exists(directory):
os.makedirs(directory)
if coloring[i] is not None:
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.imsave(os.path.join(directory, "wordcloud.png"), wc.recolor(color_func=image_colors))
else:
plt.imshow(wc.recolor(color_func=grey_color_func), interpolation="bilinear")
plt.imsave(os.path.join(directory, "wordcloud.png"), wc.recolor(color_func=grey_color_func))
plt.show()
i+=1
masks = ["Data/WordCloudMasks/cpt_rex_mask.jpg", "Data/WordCloudMasks/sabine2.jpg"]
coloring = ["Data/WordCloudMasks/cpt_rex.jpg", "Data/WordCloudMasks/sabine2.jpg"]
i = 0
# Make Word Clouds based on TF-IDF for each TV-series
for s in animated_series_names:
print s
df_tfidf = transcript_tokens[s]["TFIDF"]
df_tfidf.index = df_tfidf.word
df_tfidf.TFIDF = np.ceil(df_tfidf.TFIDF.values)
mask = np.array(Image.open(masks[i]))
if coloring[i] is not None:
image_colors = ImageColorGenerator(np.array(Image.open(coloring[i])))
wc = WordCloud(background_color=backgrounds[i], max_words=2000, mask=mask, stopwords=STOPWORDS).generate_from_frequencies(df_tfidf["TFIDF"].to_dict())
fig = plt.figure(figsize=(50,50), frameon=False)
ax = plt.Axes(fig, [0., 0., 1., 1.], )
ax.set_axis_off()
fig.add_axes(ax)
directory = os.path.join("Data", s.replace(" ", "_").replace(":",""))
if not os.path.exists(directory):
os.makedirs(directory)
if coloring[i] is not None:
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.imsave(os.path.join(directory, "wordcloud.png"), wc.recolor(color_func=image_colors))
else:
plt.imshow(wc.recolor(color_func=grey_color_func), interpolation="bilinear")
plt.imsave(os.path.join(directory, "wordcloud.png"), wc.recolor(color_func=grey_color_func))
plt.show()
i+=1